## Rows: 104
## Columns: 5
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ dept_id <dbl> 6, 4, 3, 4, 2, 6, 3, NA, 3, 6, 6, 6, 4, 4, 6, 3, 5, 4, 4,...
## $ dept_name <chr> "Production", "Business Development", "Design", "Business...
## $ name <chr> "B****, Li*** N.", "C******, Jane G.", "E******, Ma* B.",...
## $ salary <chr> "91100", "173900", "163100", "71000", "111100", "80600", ...
#simple cleaning
#a = a %>% select(-id, -name)
a = a %>% mutate(
salary = as.numeric(salary),
dept_id = as.factor(dept_id)
)
#dept_id should be a factor/category variable
#especially import to convert if imputation is done
a %>% heada %>% count(dept_name, dept_id, name = 'count') %>% mutate(percent = count/sum(count)) %>% arrange(dept_name, -percent)#correct dept misspellings
a = a %>% mutate(
dept_name = if_else(dept_name == 'Business Developement', 'Business Development', dept_name),
dept_name = if_else(dept_name == 'Producdion', 'Production', dept_name),
) %>% mutate(
dept_name = factor(dept_name)
)
#check
a %>% select(dept_name) %>% freq(dept.mapping = tibble(
dept_id = a %>% pull(dept_id) %>% unique() %>% sort,
dept_name = c('Human Resources', 'Design', 'Business Development', 'Accounting', 'Production'),
))
But . . . there’s a faster and more automated way! Imputation via Machine Learning, specifically the Random Forest Algorithm (both Classification and Regression).
missing.dept_name.ids = a %>% filter(is.na(dept_name)) %>% pull(id)
# these rows are missing for dept_id
a %>% filter(is.na(dept_id))missing.dept_id.ids = a %>% filter(is.na(dept_id)) %>% pull(id)
# these rows are missing for salary
a %>% filter(is.na(salary))a.imputed %>% group_by(dept_name) %>% summarise(
mean.salary = mean(salary),
median.salary = median(salary)
)## `summarise()` ungrouping output (override with `.groups` argument)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = salary ~ dept_name, data = a.imputed)
##
## $dept_name
## diff lwr upr p adj
## Business Development-Accounting 9080.400 -27368.406 45529.2059 0.9575614
## Design-Accounting 46343.787 11206.158 81481.4153 0.0036298
## Human Resources-Accounting 6247.651 -37663.538 50158.8409 0.9947385
## Production-Accounting 13175.588 -19715.006 46066.1816 0.7986801
## Design-Business Development 37263.387 5869.816 68656.9577 0.0116113
## Human Resources-Business Development -2832.749 -43809.521 38144.0243 0.9996893
## Production-Business Development 4095.188 -24761.259 32951.6345 0.9947901
## Human Resources-Design -40096.135 -79911.125 -281.1455 0.0475253
## Production-Design -33168.199 -60349.663 -5986.7345 0.0087443
## Production-Human Resources 6927.937 -30918.749 44774.6222 0.9862736
TukeyHSD(anova.salary.dept) %>% tidy %>% filter(adj.p.value < 0.05) %>% select(contrast, adj.p.value)
Looking at the salary distribution by dept viz above, the results above make sense
The ‘Design’ dept clearly has a median pay above that of the other departments. Even more telling, its first quartile, at $103,250 is nearly higher than the median of 3 other departments.